#modify datatype
df1['DATE'] =pd.to_datetime(df1['DATE'])
#extract Year and Month number as String - YYYYMM format
df1['month_id'] = df1['DATE'].dt.strftime('%Y%m')
#define metrics: For each store and month
# total sales, number of customers,Transactions,Total Quantity
grouped = df1.groupby(['STORE_NBR','month_id'])
metrics = grouped.agg(
totSales = ('TOT_SALES','sum'),
nCustomers = ('LYLTY_CARD_NBR', 'nunique'),
nTranactions=('TXN_ID','nunique'),
totalqty = ('PROD_QTY','sum')
).reset_index()
#transactions per customer
#chips per customer
#Average Price per unit
metrics['nTxnPerCust'] = metrics['nTranactions'] / metrics['nCustomers']
metrics['nChipsPerTxn'] = metrics['totalqty'] / metrics['nTranactions']
metrics['avgPricePerUnit'] = metrics['totSales'] / metrics['totalqty']
#month_id to integer
metrics['month_id'] = metrics['month_id'].astype(int)
#stores with 12 full months
store_month_counts = metrics.groupby('STORE_NBR')['month_id'].nunique()
stores_with_full_obs = store_month_counts[store_month_counts == 12].index.tolist()
#Pre-trial period (< 2019 02)
pre_trial_measures = metrics[
(metrics['month_id'] < 201902) &
(metrics['STORE_NBR'].isin(stores_with_full_obs))
# Function to calculate correlation for a measure.
# Looping through each control store.
def calculate_correlation(input_df, metric_col, store_comparison):
# Get all other stores (control candidates)
store_numbers = input_df['STORE_NBR'].unique()
store_numbers = [store for store in store_numbers if store != store_comparison]
corr_results = []
for store in store_numbers:
# Filter data for each store
trial_data = input_df[input_df['STORE_NBR'] == store_comparison][['month_id', metric_col]]
control_data = input_df[input_df['STORE_NBR'] == store][['month_id', metric_col]]
# Merge on month_id
merged = pd.merge(trial_data, control_data, on='month_id', suffixes=('_trial', '_control'))
# Calculate correlation
corr = merged[f'{metric_col}_trial'].corr(merged[f'{metric_col}_control'])
# Save result
corr_results.append({
'Store1': store_comparison,
'Store2': store,
'corr_measure': corr
})
return pd.DataFrame(corr_results)
correlation_table = calculate_correlation(metrics, 'totSales', 77)
print(correlation_table.head())
#A standardised metric based on the
#absolute difference between the trial store's performance
#and each control store's performance.
def calculate_magnitude_distance(input_df, metric_col, store_comparison):
"""
input_df: DataFrame with columns ['STORE_NBR', 'month_id', <metric_col>]
metric_col: str, name of the metric column to compare (e.g. 'totSales')
store_comparison: int, the trial store number
"""
control_stores = input_df['STORE_NBR'].unique()
control_stores = [s for s in control_stores if s != store_comparison]
result_rows = []
for store in control_stores:
# Filter data for each store
trial_data = input_df[input_df['STORE_NBR'] == store_comparison][['month_id', metric_col]]
control_data = input_df[input_df['STORE_NBR'] == store][['month_id', metric_col]]
# Merge on month_id
merged = pd.merge(trial_data, control_data, on='month_id', suffixes=('_trial', '_control'))
# Calculate absolute difference
merged['measure'] = abs(merged[f'{metric_col}_trial'] - merged[f'{metric_col}_control'])
# Add store numbers to result
merged['Store1'] = store_comparison
merged['Store2'] = store
# Only keep necessary columns
result_rows.append(merged[['Store1', 'Store2', 'month_id', 'measure']])
# Combine all results
calc_dist_df = pd.concat(result_rows, ignore_index=True)
return calc_dist_df